San Francisco Crime Classification

San Francisco crime dataset from Kaggle

Data fields:

  • Dates - timestamp of the crime incident
  • Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
  • Descript - detailed description of the crime incident (only in train.csv)
  • DayOfWeek - the day of the week
  • PdDistrict - name of the Police Department District
  • Resolution - how the crime incident was resolved (only in train.csv)
  • Address - the approximate street address of the crime incident
  • X - Longitude
  • Y - Latitude

In [ ]:
# import all required libraries
library(ggplot2)    # Data visualization
library(scales)     # Plot scaling
library(lattice)

library(data.table) # Much faster data frames
library(dplyr)      # Data aggregation etc.
library(ggmap)      # Load maps from OSM, etc.

In [ ]:
# Read the training set; the first row of the file holds the column names.
train_csv <- "../../data/sf-crime/train.csv"
train <- fread(train_csv, header = TRUE)

In [ ]:
# Show the compact structure of the loaded table (column names, types, first values).
str(object = train)

Data Preparation


In [ ]:
# Parse the incident timestamps and derive calendar features from them.
#
# The previous call passed "PST" as the time zone. That is not an Olson/IANA
# zone name, so R cannot resolve it and falls back to UTC (some platforms also
# emit a warning). Use the proper zone name for San Francisco instead.
sf_tz <- "America/Los_Angeles"

dates_raw <- train$Dates
if (inherits(dates_raw, "POSIXct")) {
  # NOTE(review): some data.table versions already parse timestamp columns
  # while reading. Re-labelling such a value with another zone would shift its
  # wall-clock time, so render it back to text (in its own zone) first.
  dates_raw <- format(dates_raw, "%Y-%m-%d %H:%M:%S")
}
train$Dates <- as.POSIXct(dates_raw, tz = sf_tz)

# format() renders in the zone attached to Dates, so Year and Month follow the
# local calendar date of each incident.
train$Year  <- as.numeric(format(train$Dates, "%Y"))
train$Month <- as.numeric(format(train$Dates, "%m"))

# Convert the grouping columns to factors; levels(train$Category) is used
# later on to enumerate every category.
train$Category <- as.factor(train$Category)
train$DayOfWeek <- as.factor(train$DayOfWeek)
train$PdDistrict <- as.factor(train$PdDistrict)

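Restrict to year 2010 (currently disabled: the filter in the next cell is commented out, and the restriction is only applied later, just before the map is drawn)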


In [ ]:
#train <- subset(train, Year == 2010)

In [ ]:
# Count incidents per Category, most frequent first.
agg.cat <- train %>%
  group_by(Category) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

# Re-level Category by ascending count so the bars are drawn in count order
# instead of alphabetically.
ascending <- order(agg.cat$count)
agg.cat$Category <- factor(agg.cat$Category, levels = agg.cat$Category[ascending])

# Horizontal bar chart of the per-category totals.
ggplot(agg.cat, aes(x = Category, y = count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme(
    panel.grid.major.y = element_blank(),
    axis.ticks = element_blank()
  )

In [ ]:
# Drop rows whose latitude (Y) is not below 40; these are treated as invalid
# coordinates. Rows where the comparison is NA are dropped as well.
train <- train[Y < 40]

In [ ]:
# Per-column summary of the filtered table.
summary(object = train)

In [ ]:
# Compute the monthly incident count per Category.
train.agg <- train %>%
  group_by(Year, Month, Category) %>%
  summarize(count = n()) %>%
  ungroup() # drop the leftover grouping so later steps work on a plain table

# Represent every (Year, Month) pair by the first day of that month.
train.agg$Date <- as.POSIXct(paste(train.agg$Year, train.agg$Month, "01", sep = "-"))

# Build every (month, category) combination so that months without any
# incident of a category still get a row. merge() on two tables that share no
# column names returns their cross product.
alldates <- data.frame(Date = with(train.agg, seq(min(Date), max(Date), by = "month")))
allcatdates <- merge(alldates, data.frame(Category = levels(train$Category)))
train.agg <- merge(train.agg, allcatdates, by = c("Date", "Category"), all = TRUE)

# Combinations that were missing from the counts come back with count = NA;
# set them to 0. Index the column vector directly: `train.agg[mask]` selects
# columns (not rows) when train.agg is a plain data.frame, so the previous
# form only worked on a data.table.
train.agg$count[is.na(train.agg$count)] <- 0L

In [ ]:
# Plot the monthly count per Category, one small panel per category.
# X-axis ticks: every second January from 2003 through 2015.
breaks <- seq(
  from = as.POSIXct("2003-01-01"),
  to = as.POSIXct("2015-01-01"),
  by = "2 years"
)
year_labels <- format(breaks, "%Y")

ggplot(train.agg, aes(x = Date, y = count, colour = Category, group = Category)) +
  geom_line(size = 0.2) +
  # each panel gets its own y range, always starting at 0
  facet_wrap(~Category, scales = "free_y", ncol = 5) +
  scale_x_datetime(breaks = breaks, labels = year_labels) +
  scale_y_continuous(limits = c(0, NA)) +
  labs(title = "Monthly Frequency of Crime Events per Category") +
  theme(
    legend.position = "none",
    strip.text = element_text(size = 5),
    axis.text = element_text(size = 4)
  )

In [ ]:
# Keep only incidents from 2010 whose latitude (Y) is below 40. Rows where
# either comparison is NA are dropped. Note that this overwrites `train`.
train <- train[Y < 40 & Year == 2010]

# Fetch a black-and-white base map of the city to draw the incidents on.
# NOTE(review): whether the "osm" source is still served depends on the
# installed ggmap version - confirm before relying on this call.
sfMap <- get_map(location = "San Francisco", zoom = 12, color = "bw", source = "osm")

In [ ]:
# Overlay the incident locations on the base map, coloured by police district.
ggmap(sfMap) +
  geom_point(
    mapping = aes(x = X, y = Y, colour = PdDistrict),
    data = train,
    alpha = 0.3,
    size = 0.1
  ) +
  # facet_wrap(~PdDistrict, ncol = 4) +   # optional: one panel per district
  theme(axis.text = element_text(size = 5), legend.position = "none")

In [ ]: